In [1]:
# necessary imports
import pandas as pd
import numpy as np
import ast
import warnings
# NOTE: this silences all warnings for the rest of the kernel session
warnings.filterwarnings('ignore')
import plotly.express as px
import sys
# extend the module search path so that the `lda` import below can be resolved
# (presumably lda.py lives in ../scripts/ relative to this notebook -- confirm)
sys.path.append('../scripts/')
from lda import create_documents, run_ldas, get_topicwords_scores, get_tweet_topic, get_hashtag_topic
In [2]:
# load the preprocessed tweets that feed the LDA
df = pd.read_json('../../data/BTW17_Twitter/lda/preprocessed_lda_tweets.json')

# every 'user' entry is parsed with ast.literal_eval and its 'screen_name'
# is stored in a new 'username' column
df['username'] = df['user'].map(
    lambda raw_user: ast.literal_eval(raw_user).get('screen_name')
)

# preview the first three rows
df.head(3)
Out[2]:
created_at id text user extended_tweet retweeted_status tags full_text full_text_processed tokens user_id username
0 2017-08-02 892868367195025408 RT @Beatrix_vStorch: Ja zum #Diesel. https://t... {'id': 2746361571, 'id_str': '2746361571', 'na... 0 {'created_at': 'Wed Aug 02 16:23:23 +0000 2017... diesel Ja zum #Diesel. https://t.co/NCcsfSkBl7 Ja zum #Diesel. https://t.co/NCcsfSkBl ['diesel'] 2746361571 UweWolff966
1 2017-08-02 892890742905081856 RT @Beatrix_vStorch: Ja zum #Diesel. https://t... {'id': 4379225363, 'id_str': '4379225363', 'na... 0 {'created_at': 'Wed Aug 02 16:23:23 +0000 2017... diesel Ja zum #Diesel. https://t.co/NCcsfSkBl7 Ja zum #Diesel. https://t.co/NCcsfSkBl ['diesel'] 4379225363 F_von_Steiner
2 2017-08-03 892911669633966080 #Diesel #Dobrindt's schützende Hand über Autok... {'id': 22900494, 'id_str': '22900494', 'name':... 0 0 diesel #Diesel #Dobrindt's schützende Hand über Autok... #Diesel #Dobrindt's schützende Hand über Autok... ['diesel', 'schützend', 'hand', 'autokonzerne'... 22900494 BaerbelHoehn
In [3]:
# join tweets from same author and same hashtag to one document
# (create_documents is imported from the project's lda module; its result is
# what run_ldas consumes further down)
documents = create_documents(df)
In [4]:
# train the candidate LDA models; run_ldas returns three values, unpacked here
# as the model list, the coherence scores and the tf-idf corpus
# (10 and 15 presumably bound the number of topics to try -- confirm in lda.py)
model_list, coherence_scores, corpus_tfidf = run_ldas(documents, 10, 15)

# line chart of the c_v coherence over the number of topics
line_style = dict(
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig = px.line(coherence_scores, x='num_topics', y='c_v', **line_style)
fig.show()
In [5]:
# select the model with the highest c_v coherence; model_list is indexed with
# the position of that score inside coherence_scores['c_v']
c_v_scores = coherence_scores['c_v']
best_model_index = c_v_scores.index(max(c_v_scores))
lda_model = model_list[best_model_index]
In [6]:
# print every topic of the selected model together with its weighted words
topic_template = 'Topic: {} \nWords: {}'
for topic_id, weighted_words in lda_model.print_topics(-1):
    print(topic_template.format(topic_id, weighted_words))
Topic: 0 
Words: 0.057*"einschalten" + 0.057*"gleichen" + 0.054*"schlussrunde" + 0.052*"ard" + 0.050*"uhr" + 0.049*"zdf" + 0.049*"kämpfen" + 0.048*"platz" + 0.045*"wahl" + 0.036*"amp"
Topic: 1 
Words: 0.024*"wohl" + 0.015*"wählen" + 0.012*"gefährden" + 0.012*"afd" + 0.011*"ausrechnen" + 0.010*"btw" + 0.010*"danach" + 0.010*"first" + 0.010*"staatliche" + 0.010*"bekämpfen"
Topic: 2 
Words: 0.046*"trump" + 0.016*"wahlarena" + 0.015*"gabriel" + 0.014*"usa" + 0.014*"krieg" + 0.014*"merkel" + 0.013*"amp" + 0.012*"hören" + 0.012*"personal" + 0.010*"pflege"
Topic: 3 
Words: 0.021*"oktoberfest" + 0.019*"bundesregierung" + 0.015*"demokrat" + 0.015*"gauland" + 0.015*"rufen" + 0.014*"schönes" + 0.014*"wahlboykott" + 0.014*"mitglied" + 0.014*"alexander" + 0.011*"hoch"
Topic: 4 
Words: 0.046*"dresden" + 0.034*"frau" + 0.028*"merkel" + 0.026*"dank" + 0.020*"demokratie" + 0.017*"oktoberfest" + 0.015*"entzaubern" + 0.015*"unglaublich" + 0.014*"grüne" + 0.014*"eigen"
Topic: 5 
Words: 0.030*"wort" + 0.025*"vergleichen" + 0.023*"dank" + 0.021*"unterstützung" + 0.021*"zeigen" + 0.019*"afd" + 0.018*"retweete" + 0.018*"türkei" + 0.017*"traudichdeutschland" + 0.017*"erdogan"
Topic: 6 
Words: 0.014*"gegensatz" + 0.013*"leipzig" + 0.013*"linke" + 0.011*"bleiben" + 0.011*"amp" + 0.011*"partei" + 0.011*"ganz" + 0.010*"wer" + 0.010*"darüber" + 0.010*"gar"
Topic: 7 
Words: 0.037*"drohen" + 0.028*"maas" + 0.025*"noafd" + 0.024*"verletzen" + 0.017*"nonazis" + 0.016*"sehen" + 0.015*"btw" + 0.009*"gesetzlich" + 0.009*"mutmaßen" + 0.009*"verstoße"
Topic: 8 
Words: 0.030*"nordkorea" + 0.024*"brexit" + 0.022*"trump" + 0.020*"sagen" + 0.016*"frau" + 0.016*"zusammen" + 0.013*"kritik" + 0.011*"twitter" + 0.011*"verändern" + 0.011*"schande"
Topic: 9 
Words: 0.031*"fakenews" + 0.022*"schäuble" + 0.021*"iaa" + 0.017*"erzählen" + 0.017*"sein" + 0.016*"gehen" + 0.015*"gerechtigkeit" + 0.014*"gut" + 0.014*"merkel" + 0.013*"eigen"
Topic: 10 
Words: 0.033*"terror" + 0.018*"seehofer" + 0.018*"verteidigen" + 0.018*"unternehmen" + 0.017*"obergrenze" + 0.017*"freiheit" + 0.014*"familiennachzug" + 0.013*"vollkommen" + 0.013*"abend" + 0.013*"darumgrün"
Topic: 11 
Words: 0.026*"spaß" + 0.023*"tiefpunkt" + 0.021*"politisch" + 0.017*"ausgeben" + 0.017*"islam" + 0.013*"verrücken" + 0.012*"klimaschutz" + 0.012*"tvduell" + 0.012*"darumgrün" + 0.012*"schulz"
Topic: 12 
Words: 0.025*"info" + 0.025*"völlig" + 0.015*"traudichdeutschland" + 0.014*"rentenniveau" + 0.014*"stoppen" + 0.013*"martinschulz" + 0.012*"altersarmut" + 0.012*"mio" + 0.012*"polizei" + 0.012*"afd"
Topic: 13 
Words: 0.019*"klimawandel" + 0.017*"afd" + 0.016*"traudichdeutschland" + 0.015*"extremwetter" + 0.015*"wegen" + 0.015*"btw" + 0.014*"mehr" + 0.013*"wahllokale" + 0.013*"öffnen" + 0.013*"recht"
Topic: 14 
Words: 0.079*"retweete" + 0.063*"haben" + 0.056*"traudichdeutschland" + 0.051*"wählen" + 0.049*"afd" + 0.034*"btw" + 0.028*"unterstützung" + 0.022*"zeigen" + 0.020*"emobilität" + 0.017*"ziel"
In [4]:
# extract the words and scores of each topic from the model and persist them
# NOTE(review): the execution counter restarts at this cell (In [4] follows
# In [6]), so the saved outputs stem from out-of-order or multi-session runs;
# verify the notebook with Restart Kernel -> Run All
topics_path = '../../data/BTW17_Twitter/lda/topics.json'
topic_df = get_topicwords_scores(lda_model)
topic_df.to_json(topics_path)
In [5]:
# get tweet topics
# get_tweet_topic (from the lda module) receives the selected model and the
# tweet frame; the preview of output_df below shows the additional columns
# 'topic' and 'topic_score'
output_df = get_tweet_topic(lda_model, df)
In [6]:
# preview the first three tweets together with their topic and topic score
output_df.head(3)
Out[6]:
created_at id text user extended_tweet retweeted_status tags full_text full_text_processed tokens user_id username topic topic_score
0 2017-08-02 892868367195025408 RT @Beatrix_vStorch: Ja zum #Diesel. https://t... {'id': 2746361571, 'id_str': '2746361571', 'na... 0 {'created_at': 'Wed Aug 02 16:23:23 +0000 2017... diesel Ja zum #Diesel. https://t.co/NCcsfSkBl7 Ja zum #Diesel. https://t.co/NCcsfSkBl ['diesel'] 2746361571 UweWolff966 3 0.533313
1 2017-08-02 892890742905081856 RT @Beatrix_vStorch: Ja zum #Diesel. https://t... {'id': 4379225363, 'id_str': '4379225363', 'na... 0 {'created_at': 'Wed Aug 02 16:23:23 +0000 2017... diesel Ja zum #Diesel. https://t.co/NCcsfSkBl7 Ja zum #Diesel. https://t.co/NCcsfSkBl ['diesel'] 4379225363 F_von_Steiner 3 0.533313
2 2017-08-03 892911669633966080 #Diesel #Dobrindt's schützende Hand über Autok... {'id': 22900494, 'id_str': '22900494', 'name':... 0 0 diesel #Diesel #Dobrindt's schützende Hand über Autok... #Diesel #Dobrindt's schützende Hand über Autok... ['diesel', 'schützend', 'hand', 'autokonzerne'... 22900494 BaerbelHoehn 3 0.761987
In [7]:
# number of distinct topics per hashtag, sorted ascending for the bar chart
plot_df = (
    output_df.groupby('tags', as_index=False)['topic']
    .nunique()
    .sort_values(by='topic')
    .reset_index()
    .rename(columns={'tags': 'Hashtag', 'topic': 'Anzahl Topics'})
)

# horizontal bars: one bar per hashtag
bar_style = dict(
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
    orientation='h',
)
fig = px.bar(plot_df, x='Anzahl Topics', y='Hashtag', **bar_style)
fig.show()
In [8]:
# plot number of tweets per topic
# Count the rows of every topic id and use the real id (not a positional
# index) as the bar label, so the chart stays correct even when some topic id
# has no tweets assigned to it.
plot_df = (
    output_df.groupby('topic')
    .size()
    .reset_index(name='Häufigkeit')
    .rename(columns={'topic': 'Topic'})
)
fig = px.bar(plot_df, x='Topic', y='Häufigkeit',
             template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)
fig.show()
In [9]:
# plot average topic score
# Average the topic score per topic id. The grouping column itself becomes the
# 'Topic' axis, so every bar is labelled with its real topic id rather than
# with a positional row number.
plot_df = (
    output_df.groupby('topic', as_index=False)['topic_score']
    .mean()
    .rename(columns={'topic': 'Topic',
                     'topic_score': 'Durchschnittlicher Topic Score'})
)
fig = px.bar(plot_df, x='Topic', y='Durchschnittlicher Topic Score',
             template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)
fig.show()
In [16]:
# derive the topic of each hashtag and attach the topic words/scores from
# topic_df (left join on 'topic' keeps every hashtag row)
hashtag_topics = get_hashtag_topic(output_df).merge(topic_df, on='topic', how='left')

# persist the merged result as json
hashtag_topics_path = '../../data/BTW17_Twitter/lda/hashtag_topics.json'
hashtag_topics.to_json(hashtag_topics_path)